This R markdown file summarizes Simulation Study results.

OK, Now show the performance summary

A plot of HMM surface that infer tract length at boundary from each tract length condition

Now plot the two plots of lnL

Now see how estimates from the two approaches differ from the actual mean tract length in each simulated data set.

09152017

Show the PSJS estimated results for Simulated datasets with estimated tau

realized.tract.dist <- function(L, p){
  x <- 1:(L-1)
  dist.1 <- (2 + (L-1-x)*p)/(1.0/p + L -1.)*(1-p)^(x-1)
  dist.L <- 1/p/(1.0/p + L -1.)*(1-p)^(L-1)
  dist <- c(dist.1, dist.L)
  mean.L <- sum(x * dist.1) + L*dist.L
  var.L <- sum(x^2 * dist.1) + L^2*dist.L - mean.L^2
  return(list(dist = dist, mean = mean.L, sd = sqrt(var.L)))
}
Tract.list <- c(50.0, 100.0, 300.0)
for(tract in Tract.list){
  target_summary <- get(paste("PSJS_HKY_Tract_", toString(tract), "_combined_summary", sep = ""))
  col.names <- target_summary["tract_length", ] < 10*tract
  sim_info <- get(paste("sim.tract.", toString(tract), sep = ""))
  
  hist(target_summary["tract_length", col.names], breaks = 50,
       main = paste("PSJS Estimated Tract length 1/p, Tract = ", toString(tract), ".0 ", sep = ""))
  abline(v =  realized.tract.dist(492, 1.0/tract)$mean, col = "blue")
  abline(v =  tract, col = 2)
  abline(v =  mean(sim_info["mean subtract length", ]), col = "green")
  
  hist(-log(target_summary["tract_length", col.names]), breaks = 50,
       main = paste("PSJS Estimated ln(p), Tract = ", toString(tract), ".0", sep = ""))
  abline(v = log(1.0 / realized.tract.dist(492, 1.0/tract)$mean), col = "blue")
  abline(v = log(1.0 / tract), col = 2)
  abline(v = log(1.0 / mean(sim_info["mean subtract length", ])), col = "green")
  
  
  hist((1/target_summary["tract_length", col.names]), breaks = 50,
       main = paste("PSJS Estimated p, Tract = ", toString(tract), ".0", sep = ""))
  abline(v = 1.0 / tract, col = 2)
  abline(v = 1.0 / realized.tract.dist(492, 1.0/tract)$mean, col = "blue")
  abline(v = 1.0 / mean(sim_info["mean subtract length", ]), col = "green")
  
  plot(sim_info["num IGC", ], sim_info["mean potential tract length", ], 
       main = paste("Simulated potential tract length, Tract = ", toString(tract), sep = ""), 
       xlab = "number of IGC events", ylab = "mean potential tract length")
  abline(h = tract, col = "red")
  abline(h = mean(sim_info["mean subtract length", ]), col = "green")
  
  plot(sim_info["num IGC", ], sim_info["mean realized tract length", ], 
       main = paste("Simulated realized tract length, Tract = ", toString(tract), sep = ""),
       xlab = "number of IGC events", ylab = "mean realized tract length")
  abline(h = realized.tract.dist(492, 1/tract)$mean, col = "red")
  abline(h = mean(sim_info["mean subtract length", ]), col = "green")
  
  plot(sim_info["num IGC", colnames(target_summary)[col.names]], target_summary["tract_length", col.names],
       main = paste("PSJS estimate of Tract ", toString(tract), sep = ""),
       xlab = "number of IGC events", ylab = "PSJS estimated tract length")
  abline(h = tract, col = "red")
  abline(h = realized.tract.dist(492, 1/tract)$mean, col = "blue")
  abline(h = mean(sim_info["mean subtract length", ]), col = "green")
  
  plot(sim_info["num IGC", colnames(target_summary)[col.names]], target_summary["tract_length", col.names],
       main = paste("PSJS estimate of Tract ", toString(tract), sep = ""),
       xlab = "number of IGC events", ylab = "PSJS estimated tract length")
  abline(h = tract, col = "red")
  abline(h = realized.tract.dist(492, 1/tract)$mean, col = "blue")
  abline(h = mean(sim_info["mean subtract length", ]), col = "green")

  plot(sim_info["num IGC with at least two variant sites", colnames(target_summary)[col.names]], 
       target_summary["tract_length", col.names],
       main = paste(" PSJS estimated vs num IGC >1 variant sites - Tract ", toString(tract), sep = ""),
       xlab = "num IGC with at least two variant sites", ylab = "PSJS estimated tract length")
  abline(h = tract, col = 2)
  abline(h = mean(sim_info["mean subtract length", ]), col = "green")
  
  # Now plot kappa estimates
  plot(sim_info["num IGC", colnames(target_summary)[col.names]], 
       target_summary["kappa", col.names],
       main = paste("PSJS kappa, Tract = ", toString(tract), sep = ""),
       xlab = "Num IGC", ylab = "PSJS estimated kappa")
  abline(h = 14.46399, col = 2)
  
  # Now plot r2 estimates
  plot(sim_info["num IGC", colnames(target_summary)[col.names]], 
       target_summary["r2", col.names],
       main = paste("PSJS r2, Tract = ", toString(tract), sep = ""),
       xlab = "Num IGC", ylab = "PSJS estimated r2")
  abline(h = 0.5391702, col = 2)
  
  # Now plot r3 estimates
  plot(sim_info["num IGC", colnames(target_summary)[col.names]], 
       target_summary["r3", col.names],
       main = paste("PSJS r3, Tract = ", toString(tract), sep = ""),
       xlab = "Num IGC", ylab = "PSJS estimated r3")
  abline(h = 11.58006, col = 2)
  
  # Now plot initiation rate estimates
  plot(sim_info["num IGC", colnames(target_summary)[col.names]], 
       target_summary["init_rate", col.names],
       main = paste("PSJS init_rate, Tract = ", toString(tract), sep = ""),
       xlab = "Num IGC", ylab = "PSJS estimated init_rate")
  abline(h = 22.58153 / tract, col = 2)
  
  # Now plot tract p estimates
  plot(sim_info["num IGC", colnames(target_summary)[col.names]], 
       1.0/target_summary["tract_length", col.names],
       main = paste("PSJS tract_p, Tract = ", toString(tract), sep = ""),
       xlab = "Num IGC", ylab = "PSJS estimated tract_p")
  abline(h = 1.0 / tract, col = 2)
  
  # Now plot Tau estimates
  plot(sim_info["num IGC", colnames(target_summary)[col.names]], 
       target_summary["init_rate", col.names]*target_summary["tract_length", col.names],
       main = paste("PSJS Tau, Tract = ", toString(tract), sep = ""),
       xlab = "Num IGC", ylab = "PSJS estimated Tau")
  abline(h = 22.58153, col = 2) 
  
  plot(sim_info["num IGC", colnames(target_summary)[col.names]], 
       target_summary["init_rate", col.names]*target_summary["tract_length", col.names]*3.0/(1.0 +target_summary["r2", col.names] + target_summary["r3", col.names]),
       main = paste("PSJS Weighted Tau, Tract = ", toString(tract), sep = ""),
       xlab = "Num IGC", ylab = "PSJS Tau*3/(1+r2+r3)")
  abline(h = 5.16376333125, col = 2) 
  
  # Now plot initiation rate vs tract length
  plot(target_summary["tract_length", col.names], 
       target_summary["init_rate", col.names],
       main = paste("PSJS init_rate, Tract = ", toString(tract), sep = ""),
       xlab = "Tract length", ylab = "initiation rate")
  lines(sort(target_summary["tract_length", col.names]), 22.58153/sort(target_summary["tract_length", col.names]), 
        col = "red", type = "l")  
  
  plot(sim_info["mean realized tract length", colnames(target_summary)[col.names]], 
       target_summary["tract_length", col.names],
       main = paste(" mean realized vs PSJS estimated Tract ", toString(tract), sep = ""),
       xlab = "mean realized IGC tract length", ylab = "PSJS estimated")
  abline(a= 0.0, b = 1.0, col = "red") 
  
  plot(sim_info["mean potential tract length", colnames(target_summary)[col.names]], 
       target_summary["tract_length", col.names],
       main = paste(" mean potential vs PSJS estimated Tract ", toString(tract), sep = ""),
       xlab = "mean potential IGC tract length", ylab = "PSJS estimated")
  abline(a = 0.0, b = 1.0, col = 2) 
  
  plot(sim_info["mean subtract length", colnames(target_summary)[col.names]], 
       target_summary["tract_length", col.names],
       main = paste(" mean longest variant pair length vs PSJS estimated Tract ", toString(tract), sep = ""),
       xlab = "mean longest variant pair length", ylab = "PSJS estimated")
  abline(a = 0.0, b = 1.0, col = "green") 
  
  
  print(paste("Tract = ", toString(tract), ".0 combined PSJS HKY Results", sep = ""))
  print(matrix(c("Total samples", sum(col.names),
                 "mean estimates", mean(target_summary["tract_length", col.names]), 
                 "sd estimates", sd(target_summary["tract_length", col.names]),
                 "mean longest variant pair", mean(sim_info["mean subtract length", col.names]),
                 "sd longest variant pair", sd(sim_info["mean subtract length", col.names])), 2, 5))  
  
}

## [1] "Tract = 50.0 combined PSJS HKY Results"
##      [,1]            [,2]               [,3]              
## [1,] "Total samples" "mean estimates"   "sd estimates"    
## [2,] "99"            "37.3363344986544" "18.0913726165357"
##      [,4]                        [,5]                     
## [1,] "mean longest variant pair" "sd longest variant pair"
## [2,] "35.3293915861496"          "9.16677454731404"

## [1] "Tract = 100.0 combined PSJS HKY Results"
##      [,1]            [,2]               [,3]              
## [1,] "Total samples" "mean estimates"   "sd estimates"    
## [2,] "98"            "63.1609458480232" "30.7380734274971"
##      [,4]                        [,5]                     
## [1,] "mean longest variant pair" "sd longest variant pair"
## [2,] "68.6059345139295"          "21.3807703280689"

## [1] "Tract = 300.0 combined PSJS HKY Results"
##      [,1]            [,2]               [,3]              
## [1,] "Total samples" "mean estimates"   "sd estimates"    
## [2,] "97"            "180.281703030374" "291.767287056941"
##      [,4]                        [,5]                     
## [1,] "mean longest variant pair" "sd longest variant pair"
## [2,] "167.215846779483"          "58.843309625813"

Now look back at the simulation study of published work to see if Tau estimate is linear with # IGC

IGC.path <- "/Users/xji3/FromCluster_IGCCodonSimulation01262016/"
paml.path <- "/Users/xji3/GitFolders/IGCCodonSimulation/"
IGC.geo.list <- c(3.0, 10.0, 50.0, 100.0, 500.0)
num.sim <- 100
paralog <- "YDR418W_YEL054C"
sim.path <- paste("SimulationSummary", paralog, "",sep = "/")
for (IGC.geo in IGC.geo.list){
  summary_mat <- NULL
  IGC.geo.path <- paste("IGCgeo_", toString(IGC.geo), ".0/", sep = "")
  file.prefix <- paste(paralog, "MG94_geo", paste(toString(IGC.geo), ".0", sep = ""), "Sim", sep = "_")
  file.suffix <- "summary.txt"
  for (sim.num in 0:(num.sim - 1)){
    file.name <- paste(file.prefix, toString(sim.num), file.suffix, sep = "_")
    summary_file <- paste(IGC.path, sim.path, IGC.geo.path, file.name, sep = "")
    if (file.exists(summary_file)){
      all <- readLines(summary_file, n = -1)
      col.names <- paste("geo", paste(toString(IGC.geo), ".0", sep = ""), "sim", toString(sim.num), sep = "_")
      row.names <- strsplit(all[length(all)], ' ')[[1]][-1]
      summary_mat <- cbind(summary_mat, as.matrix(read.table(summary_file, 
                                                             row.names = row.names, 
                                                             col.names = col.names)))        
    }
  }
  assign(paste("geo", paste(toString(IGC.geo), ".0", sep = ""), "summary", sep = "_"), summary_mat)
}

# Now read in Total number of IGC events in simulation
edge.list <- c(#"N0_kluyveri", outgroup branch has no IGC events
  "N0_N1", "N1_castellii", "N1_N2", "N2_bayanus", 
  "N2_N3", "N3_kudriavzevii", "N3_N4", "N4_mikatae", "N4_N5", 
  "N5_cerevisiae", "N5_paradoxus")
nIGC.mat <- NULL
for(IGC.geo in IGC.geo.list){
  summary_mat <- NULL
  IGC.geo.path <- paste("IGCgeo_", toString(IGC.geo), ".0/sim_", sep = "")
  for(sim.num in 0:(num.sim - 1)){
    num.IGC <- 0
    for (edge in edge.list){
      file.name <- paste(IGC.geo.path, toString(sim.num), "/log/", edge, "_log.log", sep = "")
      log.file <- paste(IGC.path, paralog, "/", file.name, sep = "")
      content <- readLines(log.file)
      if (length(content)>6){
        for(i in 7:length(content)){
          line.cont <- strsplit(content[i], " ")
          if(line.cont[[1]][1] == "IGC"){
            num.IGC <- num.IGC + 1
          }
        }
      }
      
    }
    summary_mat <- c(summary_mat, num.IGC)
  }
  nIGC.mat <- rbind(nIGC.mat, summary_mat)
}
rownames(nIGC.mat) <- paste("IGCgeo_", IGC.geo.list, ".0", sep = "")
colnames(nIGC.mat) <- paste("sim_", 0:99, sep = "")

for(IGC.geo in IGC.geo.list){
  summary_mat <- get(paste("geo_", IGC.geo, ".0_summary", sep = ""))
  # Now plot Tau estimates
  plot(nIGC.mat[paste("IGCgeo_", IGC.geo, ".0", sep = ""),], 
       summary_mat["tau", ],
       main = paste("PSJS Tau, Tract = ", toString(IGC.geo), sep = ""),
       xlab = "Num IGC", ylab = "IS MG94 estimated Tau")
  abline(h = 1.409408, col = 2)   
  
}

for(tract in Tract.list){
  sim.info <- get(paste("sim.tract.", toString(tract), sep = ""))
  PSJS.info <- get(paste("PSJS_HKY_Tract_", toString(tract), "_combined_summary", sep = ""))
  shared.col <- colnames(PSJS.info)[PSJS.info["tract_length", ] < 10000 & PSJS.info["tract_length", ] > 1.]
  
  # Now show the ratio of PSJS estimated tract / actual mean tracts in simulation
  PSJS.ratio <- PSJS.info["tract_length", shared.col]/sim.info[1, shared.col]
  hist(PSJS.ratio, breaks = 50, main = paste("PSJS ratio Tract = ", toString(tract), sep = ""))
  print(matrix(c("Tract", tract, "PSJS mean", mean(PSJS.ratio), "PSJS sd", sd(PSJS.ratio)), 2, 3))
}

##      [,1]    [,2]               [,3]               
## [1,] "Tract" "PSJS mean"        "PSJS sd"          
## [2,] "50"    "1.37891029414217" "0.669068302333004"

##      [,1]    [,2]               [,3]              
## [1,] "Tract" "PSJS mean"        "PSJS sd"         
## [2,] "100"   "4.54891352050856" "2.94058720305058"

##      [,1]    [,2]              [,3]              
## [1,] "Tract" "PSJS mean"       "PSJS sd"         
## [2,] "300"   "41.117604013692" "118.369860437309"

save workspace now

save.image("./SimulationStudy.RData")